0. Packages, functions & global variables

Packages
library(data.table)
library(ape)
library(phylobase)
library(phytools)
library(phangorn)
library(stringr)
library(geiger)
library(RSvgDevice)
library(reshape2)
library(ggplot2)
library(RSvgDevice)
Functions
# Annotate branches function. This is used to label a particular branch with
# something... #
annotate_transfer_num <- function(edge_entry, index, colName) {
    edge <- edge_entry[1:2]
    edge_label <- as.character(round(edge_entry[, colName], digits = 2))
    # print(edge_label)
    
    edgelabels(edge_label, index, adj = c(0.5, -0.25), bg = "white", frame = "none", 
        cex = 1)
}

# Plot particular branches on the phylogeny #
PlotSegmentByNodes <- function(nodes0, nodes1, colour) {
    lastPP <- get("last_plot.phylo", envir = .PlotPhyloEnv)
    if (length(nodes0) != length(nodes1)) {
        tmp <- cbind(nodes0, nodes1)
        nodes0 <- tmp[, 1]
        nodes1 <- tmp[, 2]
    }
    x0 <- lastPP$xx[nodes0]
    y0 <- lastPP$yy[nodes0]
    x1 <- lastPP$xx[nodes1]
    y1 <- lastPP$yy[nodes1]
    segments(x0, y0, x0, y1, lty = 1, lwd = 3, col = colour)
    segments(x0, y1, x1, y1, lty = 1, lwd = 3, col = colour)
}

# Calculate all the relevant correlations for a given transfer column #
CalcCorrLengthTransfers <- function(full_df, transfer_column, outlier_index) {
    full_no_outlier_df <- full_df[!(full_df$Index == outlier_index), ]
    
    # Correlations - make df and fill it with different combinations #
    cor_df <- data.frame(Condition = character(), Pearson = double(), Spearman = double(), 
        Rsquared = double(), stringsAsFactors = FALSE)
    
    # All together - outlier kept #
    cor_df <- rbind(cor_df, data.frame(Condition = "All with outlier", Pearson = cor(full_df$Branch_len, 
        full_df[, transfer_column], method = "pearson"), Spearman = cor(full_df$Branch_len, 
        full_df[, transfer_column], method = "spearman"), Rsquared = summary(lm(full_df$Branch_len ~ 
        full_df[, transfer_column]))$r.squared))
    
    # All together - outlier removed #
    cor_df <- rbind(cor_df, data.frame(Condition = "All no outlier", Pearson = cor(full_no_outlier_df$Branch_len, 
        full_no_outlier_df[, transfer_column], method = "pearson"), Spearman = cor(full_no_outlier_df$Branch_len, 
        full_no_outlier_df[, transfer_column], method = "spearman"), Rsquared = summary(lm(full_no_outlier_df$Branch_len ~ 
        full_no_outlier_df[, transfer_column]))$r.squared))
    
    # Split by Subgroup # Intergroup #
    cor_df <- rbind(cor_df, data.frame(Condition = "Intergroup no outlier", 
        Pearson = cor(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup == 
            TRUE)], full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup == 
            TRUE)], method = "pearson"), Spearman = cor(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup == 
            TRUE)], full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup == 
            TRUE)], method = "spearman"), Rsquared = summary(lm(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup == 
            TRUE)] ~ full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup == 
            TRUE)]))$r.squared))
    
    # Intragroup #
    cor_df <- rbind(cor_df, data.frame(Condition = "Intragroup no outlier", 
        Pearson = cor(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup == 
            FALSE)], full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup == 
            FALSE)], method = "pearson"), Spearman = cor(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup == 
            FALSE)], full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup == 
            FALSE)], method = "spearman"), Rsquared = summary(lm(full_no_outlier_df$Branch_len[!(full_no_outlier_df$Subgroup == 
            FALSE)] ~ full_no_outlier_df[, transfer_column][!(full_no_outlier_df$Subgroup == 
            FALSE)]))$r.squared))
    
    cor_df[, 2:4] <- cbind(round(cor_df$Pearson, digits = 4), round(cor_df$Spearman, 
        digits = 4), round(cor_df$Rsquared, digits = 4))
    return(cor_df)
}
Global variables
## Variables used globally ##
penalties = c(4:6)
min_taxa = 0

single_pen_plot = TRUE
single_pen_val = 5

if (min_taxa > 4) {
    file_name_add <- paste0("_min_", min_taxa)
} else {
    file_name_add <- ""
}

1. Compare the (Anoxy)Geobacillus phylogenies

Here we load and compare the phylogenies derived from the all-species cladogram in the Mowgli analysis and the concatenated vertical protein alignment (time-proxy). We want to take the transfers assigned to branches on the species cladogram and plot them on the time-resolved tree. For this have to map the corresponding branches from the species cladorgram to the time-resolved tree and identify any incongruences.

1. Species tree (clado)

Isolate the Anoxybacillus/Geobacillus phylogeny used in the reconciliation analysis by deriving it from the all-species tree used by Mowgli.

species_tree_clado <- read.tree("/Users/aesin/Desktop/Mowgli/Species_tree/Reconciled_species_tree/outputSpeciesTree.mpr")

# Get the set of Geobacillus and Anoxybacillus tips and combine into a
# single set #
anoxy_geo_clado_tips <- grep("Geobacillus_|Anoxybacillus_", species_tree_clado$tip.label, 
    value = TRUE)

# Anoxy/Geobacillus tips used to extract the Anoxy/Geobacillus subclade.
# Transform to phylo4 object #
anoxy_geo_clado <- drop.tip(species_tree_clado, setdiff(species_tree_clado$tip.label, 
    anoxy_geo_clado_tips))
anoxy_geo_clado4 <- phylo4(anoxy_geo_clado)

# Get the node labels as a data frame ##
node_lbs_df <- data.frame(labels(anoxy_geo_clado4, "all"))
internal_nodes_species <- nodeId(anoxy_geo_clado4, "internal")
names(node_lbs_df)[1] <- "Labels"

plot.phylo(anoxy_geo_clado)

2. Time-resolved tree

This is derived from a concatenated alignment of protein products of vertically inheritted genes. Here it is rooted by midpoint and the tips are relabelled to the tips used in Mowgli (i.e. the species tree above)
1. Read in the time-resolved tree

# Read in the time-resolved tree & root by midpoint #
time_consensus_tree <- read.tree(file = "/Users/aesin/Desktop/Geo_analysis/New_geobacillus_consensus/Tree/Final_consensus_out.txt")
anoxy_geo_time <- midpoint(time_consensus_tree)

plot.phylo(anoxy_geo_time)

2. Relabel the tips of the time resolved tree to match the species tree (Mowgli tips)

# Annotate the time-resolved tree with node labels imported from the species
# tree #
anoxy_geo_time_relab <- anoxy_geo_time
for (tip in anoxy_geo_time$tip.label) {
    match_clado_name <- grep(tip, anoxy_geo_clado_tips, value = T)
    time_tip_position <- match(tip, anoxy_geo_time_relab$tip.label)
    # Replace with new name #
    anoxy_geo_time_relab$tip.label[time_tip_position] <- match_clado_name
}

# Remove node labels (BS-value) so that mowgli labels can be added; convert
# to phylo4 object #
anoxy_geo_time_relab$node.label <- NULL
anoxy_geo_time_relab_4 <- phylo4(anoxy_geo_time_relab)

plot.phylo(anoxy_geo_time_relab)

3. Incogruence between the two trees:
  • Anoxybacillus is monophyletic in the time-resolved tree (can be ignored)
  • Geobacillus G1w1 and Geobacillus stearothermophilus branch in different orders
association <- matrix(ncol = 2, nrow = 22)
association[, 1] <- association[, 2] <- anoxy_geo_time_relab$tip.label
cophylo_obj <- cophylo(anoxy_geo_clado, anoxy_geo_time_relab, assoc = association)
plot.cophylo(cophylo_obj)

4. Branch mapping
  1. Here we map the internal branches from the species phylogeny to the internal branches of the time-resolved phylogeny. If a particular internal branch cannot be mapped, we will be unable to associate any transfers with that branch.
internal_nodes_time <- nodeId(anoxy_geo_time_relab_4, "internal")
unmapped_edges <- vector(mode = "list")
unmapped_count = 1

## For each internal node, match it to the nodes from the species tree ##
for (internal_time in internal_nodes_time) {
    matched = FALSE
    for (internal_species in internal_nodes_species) {
        if (setequal(tips(anoxy_geo_time_relab, internal_time), tips(anoxy_geo_clado, 
            internal_species)) == TRUE) {
            
            ## Get the label of the node from the species-cladogram tree ##
            node_label <- anoxy_geo_clado4@label[internal_species]
            
            ## Index of the internal node label in the time-resolved tree ##
            node_index <- which(names(nodeLabels(anoxy_geo_time_relab_4)) == 
                internal_time)
            
            ## Assign the name of the node to the time-resolved tree ##
            nodeLabels(anoxy_geo_time_relab_4)[node_index] <- node_label
            
            cat(paste0("Time-resolved tree node: ", internal_time, " maps to species tree node: ", 
                internal_species, "\n"))
            matched = TRUE
            break
        }
    }
    if (matched == FALSE) {
        cat(paste0("Time tree node: ", internal_time, " did not map to any species tree node\n"))
        edge_index <- which(anoxy_geo_time_relab_4@edge[, 2] == as.character(internal_time))
        unmapped_edges[[unmapped_count]] <- as.vector(anoxy_geo_time_relab_4@edge[edge_index, 
            ])
        unmapped_count = unmapped_count + 1
    }
}
## Time-resolved tree node: 23 maps to species tree node: 23
## Time tree node: 24 did not map to any species tree node
## Time-resolved tree node: 25 maps to species tree node: 33
## Time-resolved tree node: 26 maps to species tree node: 30
## Time-resolved tree node: 27 maps to species tree node: 26
## Time-resolved tree node: 28 maps to species tree node: 24
## Time-resolved tree node: 29 maps to species tree node: 27
## Time-resolved tree node: 30 maps to species tree node: 28
## Time-resolved tree node: 31 maps to species tree node: 29
## Time-resolved tree node: 32 maps to species tree node: 31
## Time-resolved tree node: 33 maps to species tree node: 32
## Time-resolved tree node: 34 maps to species tree node: 35
## Time-resolved tree node: 35 maps to species tree node: 36
## Time-resolved tree node: 36 maps to species tree node: 37
## Time-resolved tree node: 37 maps to species tree node: 39
## Time-resolved tree node: 38 maps to species tree node: 42
## Time-resolved tree node: 39 maps to species tree node: 43
## Time-resolved tree node: 40 maps to species tree node: 40
## Time-resolved tree node: 41 maps to species tree node: 41
## Time-resolved tree node: 42 maps to species tree node: 38
## Time tree node: 43 did not map to any species tree node
  1. Because there is incongruence between species and time-resolved trees, there are two edges with which we will be unable to associate transfers
anoxy_geo_time_relab <- as(anoxy_geo_time_relab_4, "phylo")
plot(anoxy_geo_time_relab)
# Colour in the 'missing' incongruent edges #
for (i in 1:length(unmapped_edges)) {
    edge <- unmapped_edges[[i]]
    PlotSegmentByNodes(edge[1], edge[2], "red")
}

  1. Now that all the internal nodes of the time-resolved tree have been relabelled to match the species tree (where there is no incongruence), a list of all the nodes is pulled out. The row number of this dataframe corresponds to the descendent node of an edge, i.e X or Y in the below case
    ………—– X
    …..—-| Z
    ………—– Y
# Prepare the count table for the time-resolved tree as for the
# species-derived prune above ##
node_labels_dfs <- list(clado = data.frame(labels(anoxy_geo_clado4, "all")), 
    time = data.frame(labels(anoxy_geo_time_relab_4, "all")))
node_labels_dfs <- lapply(node_labels_dfs, setnames, "Node_labels")
lapply(node_labels_dfs, head)
## $clado
##                           Node_labels
## 1 Anoxybacillus_flavithermus_WK1_3602
## 2    Anoxybacillus_kamchatkensis_3603
## 3       Anoxybacillus_tepidamans_3605
## 4 Geobacillus_caldoxylosilyticus_3606
## 5           Geobacillus_sp_WCH70_3607
## 6          Geobacillus_sp_Y41MC1_3608
## 
## $time
##                           Node_labels
## 1 Anoxybacillus_flavithermus_WK1_3602
## 2 Geobacillus_stearothermophilus_3619
## 3            Geobacillus_sp_G1w1_3618
## 4            Geobacillus_vulcani_3620
## 5          Geobacillus_sp_C56_T3_3630
## 6        Geobacillus_sp_Y412MC52_3631

2. Associate transfers with branches

1. Prep output dataframes

The list contains two dataframes - one each to store the number of transfers per branch of the clado and time-resolved trees. The order of edges in these dataframes is the same as the order of node labels.

# Create dataframe in which the num. transfers at each penalty for each edge
# will be stored #
edge_dfs <- list(clado = data.frame(anoxy_geo_clado$edge), time = data.frame(anoxy_geo_time$edge))
edge_dfs <- lapply(edge_dfs, setnames, c("E1", "E2"))
edge_dfs <- lapply(edge_dfs, function(x) cbind(x, Subgroup = NA))

# For each dataframe, create columns for each penalty - fill with 0s as
# placeholders #
penalty_names <- lapply(as.list(penalties), function(x) paste0("T", x))
edge_dfs <- lapply(edge_dfs, function(x) {
    for (penalty in penalty_names) {
        x[eval(penalty)] <- 0
    }
    x
})
lapply(edge_dfs, head)
## $clado
##   E1 E2 Subgroup T4 T5 T6
## 1 23 24       NA  0  0  0
## 2 24  1       NA  0  0  0
## 3 24  2       NA  0  0  0
## 4 23 25       NA  0  0  0
## 5 25  3       NA  0  0  0
## 6 25 26       NA  0  0  0
## 
## $time
##   E1 E2 Subgroup T4 T5 T6
## 1 28  1       NA  0  0  0
## 2 28 22       NA  0  0  0
## 3 43 28       NA  0  0  0
## 4 43 21       NA  0  0  0
## 5 31 17       NA  0  0  0
## 6 31 18       NA  0  0  0
2. Read in transfers.

We read in the per-penalty transfer tables - these contain all transfers into (Anoxy)Geobacillus with associated edges, i.e. two nodes that are presented in the Mowgli species tree format (e.g. 3644). Using the node label table, we can then associate the transfers with edges in our two output dataframes.

# Directory containing the per-penalty transfer event files #
input_edge_dir <- "/Users/aesin/Desktop/Mowgli/Refined_events/Scenarios_1_2/Events/"
for (penalty in penalties) {
    
    # Read in the refined receptor edge list for a particular penalty #
    # receptor_df <- read.table(file = paste0(input_edge_dir, 'T', penalty,
    # '_full_lHGT_events', file_name_add, '.tsv'), header = FALSE, sep = '\t')
    transfer_data_df <- read.table(file = paste0(input_edge_dir, "T", penalty, 
        "_full_lHGT_events.tsv"), header = TRUE, sep = "\t")
    
    # Column name for this penalty #
    cname <- paste0("T", penalty)
    
    # For each transfer, we can identify the edge #
    at_root = 0
    for (i in 1:nrow(transfer_data_df)) {
        row <- transfer_data_df[i, ]
        receptor_nodes <- str_split(row$Receptor_nodes, pattern = " ", simplify = T)
        
        # If one of the nodes of the edge is '3728' - this is the Anoxy/Geobacillus
        # root node #
        if (receptor_nodes[1] == 3728 || receptor_nodes[2] == 3728) {
            at_root = at_root + 1
        } else {
            # Pull out row number (indication of phylogeny edge) corresponding to the
            # Mowgli tree label.
            edge_2_species <- grep(receptor_nodes[2], node_labels_dfs$clado$Node_labels)
            edge_2_time <- grep(receptor_nodes[2], node_labels_dfs$time$Node_labels)
            
            ## Fill in the count for the species-derived tree ##
            edge_dfs$clado[, cname][which(edge_dfs$clado$E2 == edge_2_species)] = edge_dfs$clado[, 
                cname][which(edge_dfs$clado$E2 == edge_2_species)] + 1
            
            ## Fill in the count for the time-derived tree ##
            edge_dfs$time[, cname][which(edge_dfs$time$E2 == edge_2_time)] = edge_dfs$time[, 
                cname][which(edge_dfs$time$E2 == edge_2_time)] + 1
        }
    }
}
lapply(edge_dfs, head)
## $clado
##   E1 E2 Subgroup  T4 T5 T6
## 1 23 24       NA  31 18 15
## 2 24  1       NA  20 15 13
## 3 24  2       NA  18 12 10
## 4 23 25       NA  49 35 20
## 5 25  3       NA  61 45 30
## 6 25 26       NA 101 90 87
## 
## $time
##   E1 E2 Subgroup T4 T5 T6
## 1 28  1       NA 20 15 13
## 2 28 22       NA 18 12 10
## 3 43 28       NA 31 18 15
## 4 43 21       NA 61 45 30
## 5 31 17       NA 74 70 61
## 6 31 18       NA 63 60 55
3. Summary statistics

Some statistics for the data so far

cat("There will be something here later")
## There will be something here later

3. Subdividing the Geobacillus clade

Aliyu et al 2016 divides the Geobacillus phylogeny into a number of species subgroups (fig 3). I applied that classification to the genomes used in this analysis. Branches separating species/tips belonging to the same subgroup are considered to not have undergone substantial differentiation and purifying selection.

1. Read in subdivisions
# Read in the table constructed based on the publication #
subspecies_groupings <- read.csv(file = "/Users/aesin/Desktop/Geo_analysis/Geo_omes/Subspecies_geo_groups.csv", 
    header = TRUE, sep = ",")

# Get those tips that have a fellow tip in the same subgroup. Isolate all
# the subgroups that have more than one member #
subspecies_only <- subset(subspecies_groupings, duplicated(subspecies_groupings$Group) | 
    duplicated(subspecies_groupings$Group, fromLast = TRUE))
unique_subgroups <- unique(subspecies_only$Group)
subspecies_only
##                                     Species Group
## 1    Geobacillus_thermodenitrificans_NG80_2    I9
## 2                    Geobacillus_sp_G11MC16    I9
## 7                      Geobacillus_sp_GHH01    I2
## 8                     Geobacillus_sp_WSUCF1    I2
## 9                     Geobacillus_sp_C56_T3    I1
## 10                  Geobacillus_sp_Y412MC61    I1
## 11                  Geobacillus_sp_Y412MC52    I1
## 12          Geobacillus_kaustophilus_HTA426    I1
## 13                      Geobacillus_sp_MAS1    I1
## 14  Geobacillus_thermoleovorans_CCB_US3_UF5    I1
## 17 Geobacillus_thermoglucosidasius_C56_YS93   II3
## 18                    Geobacillus_sp_Y41MC1   II3
2. Per edge subgroups
# For each group with more than one member, identify the tips. Find all the
# descendant nodes from the common ancestor of the subgroup (they are
# monophyletic). Using those nodes, identify all the edges (branches) that
# we will classify as being a subgroup branch (1). All other branches are
# seperate groups (0)
anoxy_geo_time4 <- phylo4(anoxy_geo_time)
for (group in unique_subgroups) {
    tips_in_subgroup <- as.vector(subspecies_only$Species[which(subspecies_only$Group == 
        group)])
    descend_nodes <- as.vector(descendants(anoxy_geo_time4, MRCA(anoxy_geo_time4, 
        tips_in_subgroup), "all"))
    
    for (node in descend_nodes) {
        edge_dfs$time$Subgroup[which(edge_dfs$time$E2 == node)] <- 1
    }
}
edge_dfs$time$Subgroup[is.na(edge_dfs$time$Subgroup)] <- 0
head(edge_dfs$time)
##   E1 E2 Subgroup T4 T5 T6
## 1 28  1        0 20 15 13
## 2 28 22        0 18 12 10
## 3 43 28        0 31 18 15
## 4 43 21        0 61 45 30
## 5 31 17        1 74 70 61
## 6 31 18        1 63 60 55
3. Group/subgroup tree
# The subgroup branches are highlighted in red #
plotBranchbyTrait(anoxy_geo_time, edge_dfs$time$Subgroup, method = "edges", 
    title = "Subgroup branches")

# for (i in 1:nrow(edge_dfs$time)) { edgelabels(i, i, adj = c(0.5, -0.25),
# bg = 'white', frame = 'none', cex = 0.7) }

4. Average transfers per branch

In the next set of calculations we use the average number of transfers per branch. This way we can include the transfer data for all penalties simultaneously.

1. Average over penalties

For each transfer penalty, make the transfers per branch a percentage of the total transfers for that penalty. Then we can take a mean over all the penalties.

# Save a copy of the dfs with the raw numbers #
edge_raw_dfs <- edge_dfs

# Make the transfers in at each branch a fraction of the total transfers #
edge_dfs <- lapply(edge_dfs, function(x) cbind(x[, 1:3], sweep(x[, -1:-3], 2, 
    colSums(x[, -1:-3]), "/") * 100))

# Make a mean column - averaging across all penalties #
edge_dfs <- lapply(edge_dfs, function(x) cbind(x, mean = rowMeans(x[, -1:-3])))

Example of the dataframe made:

lapply(edge_dfs, head)
## $clado
##   E1 E2 Subgroup        T4        T5        T6      mean
## 1 23 24       NA 1.5585721 1.0532475 0.9986684 1.2034960
## 2 24  1       NA 1.0055304 0.8777063 0.8655126 0.9162498
## 3 24  2       NA 0.9049774 0.7021650 0.6657790 0.7576404
## 4 23 25       NA 2.4635495 2.0479813 1.3315579 1.9476962
## 5 25  3       NA 3.0668678 2.6331188 1.9973369 2.5657745
## 6 25 26       NA 5.0779286 5.2662376 5.7922770 5.3788144
## 
## $time
##   E1 E2 Subgroup        T4        T5        T6      mean
## 1 28  1        0 1.0357328 0.9009009 0.8837525 0.9401287
## 2 28 22        0 0.9321595 0.7207207 0.6798097 0.7775633
## 3 43 28        0 1.6053858 1.0810811 1.0197145 1.2353938
## 4 43 21        0 3.1589850 2.7027027 2.0394290 2.6337055
## 5 31 17        1 3.8322113 4.2042042 4.1468389 4.0610848
## 6 31 18        1 3.2625583 3.6036036 3.7389531 3.5350383
2. Transfer on trees

Plot the fraction of all transfers per branch on the clado species tree

# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_clado, edge_dfs$clado$mean, method = "edges", legend = 5, 
    title = "Fraction of transfers")

# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_dfs$clado)) {
    entry <- edge_dfs$clado[i, ]
    
    annotate_transfer_num(entry, i, "mean")
}

Now plot the same fractions on the time-resolved tree - note the 0s on the incongruent edges (in black)

# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_time, edge_dfs$time$mean, method = "edges", legend = 0.1, 
    title = "Fraction of transfers")

# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_dfs$time)) {
    entry <- edge_dfs$time[i, ]
    
    annotate_transfer_num(entry, i, "mean")
}

for (i in 1:length(unmapped_edges)) {
    edge <- unmapped_edges[[i]]
    PlotSegmentByNodes(edge[1], edge[2], "black")
}

3. Branch length correlation (1)
  1. Add the branch lengths from the time-resolved tree to the time-resolved dataframe. We take these directly from the Time-resolved tree: 2. Time-resolved tree. At same time, add a point index, a mapping for each branch so any point in the correlation point can be easily associated with a branch on the tree.
edge_dfs$time <- cbind(edge_dfs$time, Branch_len = anoxy_geo_time$edge.length, 
    Index = rownames(edge_dfs$time))
head(edge_dfs$time)
##   E1 E2 Subgroup        T4        T5        T6      mean  Branch_len Index
## 1 28  1        0 1.0357328 0.9009009 0.8837525 0.9401287 0.040005508     1
## 2 28 22        0 0.9321595 0.7207207 0.6798097 0.7775633 0.038648816     2
## 3 43 28        0 1.6053858 1.0810811 1.0197145 1.2353938 0.355284380     3
## 4 43 21        0 3.1589850 2.7027027 2.0394290 2.6337055 0.148146545     4
## 5 31 17        1 3.8322113 4.2042042 4.1468389 4.0610848 0.001683514     5
## 6 31 18        1 3.2625583 3.6036036 3.7389531 3.5350383 0.001943465     6
  1. Now we trim all the branches that we do not want to correlate - this includes all Anoxybacillus-related edges and also the incogruent branch within the Geobacillus clade. The red branches will be excluded from the correlation between number of transfers and branch length.
# First find all nodes related to Anoxybacillus
anoxybacillus_tips <- grep("Anoxybacillus", anoxy_geo_time$tip.label, value = TRUE)
anoxybacillus_nodes <- as.vector(descendants(anoxy_geo_time4, MRCA(anoxy_geo_time4, 
    anoxybacillus_tips), "ALL"))

# Subset our list of edge_dfs to include a geo_only data frame
removed_edges <- edge_dfs$time[edge_dfs$time$E2 %in% as.character(anoxybacillus_nodes), 
    1:2]
edge_dfs$time_geo <- edge_dfs$time[!edge_dfs$time$E2 %in% as.character(anoxybacillus_nodes), 
    ]
removed_edges_l <- split(removed_edges, seq(nrow(removed_edges)))

# For each of the unmapped edges, identified previously, remove that edge as
# well (one is already gone as it was Anoxybacillus!) - subset to new
# dataframe.
edge_dfs$time_geo_congruent <- edge_dfs$time_geo
for (nodes in unmapped_edges) {
    removed_edges <- rbind(removed_edges, edge_dfs$time_geo_congruent[which(edge_dfs$time_geo_congruent$E1 %in% 
        nodes[1] & edge_dfs$time_geo_congruent$E2 %in% nodes[2]), 1:2])
    edge_dfs$time_geo_congruent <- edge_dfs$time_geo_congruent[!(edge_dfs$time_geo_congruent$E1 %in% 
        nodes[1] & edge_dfs$time_geo_congruent$E2 %in% nodes[2]), ]
}

plot(anoxy_geo_time)
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]], 
    "red")))

4. Branch length correlation (2)

Perform a correlation on the Geobacillus branches. Outlier identified as point 39 - corresponds to the branch that seperates two major Geobacillus subdivisions. The two subdivisions share a very different GC content in the extrant taxa.

# Write out a seperate dataframe with the subgroup column relabelled
# logically #
congruent_geo_cor_df <- edge_dfs$time_geo_congruent
congruent_geo_cor_df$Subgroup <- as.logical(congruent_geo_cor_df$Subgroup)
ggplot(congruent_geo_cor_df, aes(x = Branch_len, y = mean, label = Index, color = Subgroup)) + 
    geom_point(size = 3) + geom_text(aes(label = Index), hjust = 0, vjust = -1) + 
    xlab("Branch length") + ylab("Mean fraction of HGTs across penalties") + 
    scale_color_manual(values = c("blue", "red"))

This tree outlines the various conditions that are used in making the correlations between branch length and HGTs

# Get the edges of the outlier branch #
outlier_edge <- as.numeric(congruent_geo_cor_df[(congruent_geo_cor_df$Index == 
    39), 1:2])

# Replot the time-resolved tree with the excluded branches highlighted in
# black, and the branches labelled according to their index value #
plotBranchbyTrait(anoxy_geo_time, edge_dfs$time$Subgroup, method = "edges", 
    title = "Subgroup branches")

# Label the 'removed edges' black
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]], 
    "black")))
# Label the outlier edge green #
invisible(PlotSegmentByNodes(outlier_edge[1], outlier_edge[2], "green"))

# Label branches according to index #
for (i in as.vector(congruent_geo_cor_df$Index)) {
    edgelabels(i, as.numeric(i), adj = c(0.5, -0.25), bg = "white", frame = "none", 
        cex = 0.7)
}

# Add legend #
legend("bottomright", legend = c("Intergroup", "Intragroup", "Outlier", "Excluded"), 
    text.col = c("blue", "red", "green", "black"))

Correlations are peformed for a number of conditions - this is still using the fraction of transfers averaged over transfer penalties 4, 5, and 6.

# Create a copy of the df with outlier removed #
congruent_geo_cor_out_df <- congruent_geo_cor_df[! (congruent_geo_cor_df$Index == 39),]

# Correlations - make df and fill it with different combinations #
cor_df <- data.frame(               Condition=character(),
                                    Pearson=double(),
                                    Spearman=double(),
                                    Rsquared=double(), 
                                    stringsAsFactors = FALSE)

# All together - outlier kept #

cor_df <- rbind(cor_df, data.frame( Condition = "All with outlier",
                                    Pearson = cor(congruent_geo_cor_df$Branch_len, congruent_geo_cor_df$mean, method = "pearson"),
                                    Spearman = cor(congruent_geo_cor_df$Branch_len, congruent_geo_cor_df$mean, method = "spearman"),
                                    Rsquared = summary(lm(congruent_geo_cor_df$Branch_len ~ congruent_geo_cor_df$mean))$r.squared))

# All together - outlier removed #

cor_df <- rbind(cor_df, data.frame( Condition = "All no outlier",
                                    Pearson = cor(congruent_geo_cor_out_df$Branch_len, congruent_geo_cor_out_df$mean, method = "pearson"),
                                    Spearman = cor(congruent_geo_cor_out_df$Branch_len, congruent_geo_cor_out_df$mean, method = "spearman"),
                                    Rsquared = summary(lm(congruent_geo_cor_out_df$Branch_len ~ congruent_geo_cor_out_df$mean))$r.squared))

# Split by Subgroup #
# Intergroup #
cor_df <- rbind(cor_df, data.frame( Condition = "Intergroup no outlier",
                                    Pearson = cor(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == TRUE)], congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == TRUE)], method = "pearson"),
                                    Spearman = cor(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == TRUE)], congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == TRUE)], method = "spearman"),
                                    Rsquared = summary(lm(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == TRUE)] ~ congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == TRUE)]))$r.squared))

# Intragroup #
cor_df <- rbind(cor_df, data.frame( Condition = "Intragroup no outlier",
                                    Pearson = cor(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == FALSE)], congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == FALSE)], method = "pearson"),
                                    Spearman = cor(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == FALSE)], congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == FALSE)], method = "spearman"),
                                    Rsquared = summary(lm(congruent_geo_cor_out_df$Branch_len[! (congruent_geo_cor_out_df$Subgroup == FALSE)] ~ congruent_geo_cor_out_df$mean[! (congruent_geo_cor_out_df$Subgroup == FALSE)]))$r.squared))

cor_df[,2:4] <- cbind(round(cor_df$Pearson, digits = 4), round(cor_df$Spearman, digits = 4), round(cor_df$Rsquared, digits = 4))
print(cor_df)
##               Condition Pearson Spearman Rsquared
## 1      All with outlier  0.3444   0.5328   0.1186
## 2        All no outlier  0.7072   0.5910   0.5002
## 3 Intergroup no outlier  0.8331   0.8053   0.6941
## 4 Intragroup no outlier  0.3760   0.4206   0.1414

5. Per penalty transfers per branch

In this section we go back and calculate per penalty transfers into Geobacillus (using the raw numeric values). We produce the correlations as in 4. Average transfers per branch, except independently for each penalty. Sections 2, 3, and 4 are identical except for the change in transfer penalty.

1. Branch length & trim
  1. We add the branch lengths and index to the dataframe containing the raw numbers of transfers per penalty
edge_raw_dfs$time <- cbind(edge_raw_dfs$time, Branch_len = anoxy_geo_time$edge.length, 
    Index = rownames(edge_raw_dfs$time))
head(edge_raw_dfs$time)
##   E1 E2 Subgroup T4 T5 T6  Branch_len Index
## 1 28  1        0 20 15 13 0.040005508     1
## 2 28 22        0 18 12 10 0.038648816     2
## 3 43 28        0 31 18 15 0.355284380     3
## 4 43 21        0 61 45 30 0.148146545     4
## 5 31 17        1 74 70 61 0.001683514     5
## 6 31 18        1 63 60 55 0.001943465     6
  1. Now we trim all the branches that we do not want to correlate - this includes all Anoxybacillus-related edges and also the incogruent branch within the Geobacillus clade. The red branches will be excluded from the correlation between number of transfers and branch length.
# First find all nodes related to Anoxybacillus
anoxybacillus_tips <- grep("Anoxybacillus", anoxy_geo_time$tip.label, value = TRUE)
anoxybacillus_nodes <- as.vector(descendants(anoxy_geo_time4, MRCA(anoxy_geo_time4, 
    anoxybacillus_tips), "ALL"))

# Subset our list of edge_dfs to include a geo_only data frame
removed_edges <- edge_raw_dfs$time[edge_raw_dfs$time$E2 %in% as.character(anoxybacillus_nodes), 
    1:2]
edge_raw_dfs$time_geo <- edge_raw_dfs$time[!edge_raw_dfs$time$E2 %in% as.character(anoxybacillus_nodes), 
    ]
removed_edges_l <- split(removed_edges, seq(nrow(removed_edges)))

# For each of the unmapped edges, identified previously, remove that edge as
# well (one is already gone as it was Anoxybacillus!) - subset to new
# dataframe.
edge_raw_dfs$time_geo_congruent <- edge_raw_dfs$time_geo
for (nodes in unmapped_edges) {
    removed_edges <- rbind(removed_edges, edge_raw_dfs$time_geo_congruent[which(edge_raw_dfs$time_geo_congruent$E1 %in% 
        nodes[1] & edge_raw_dfs$time_geo_congruent$E2 %in% nodes[2]), 1:2])
    edge_raw_dfs$time_geo_congruent <- edge_raw_dfs$time_geo_congruent[!(edge_raw_dfs$time_geo_congruent$E1 %in% 
        nodes[1] & edge_raw_dfs$time_geo_congruent$E2 %in% nodes[2]), ]
}

# Extract the time_geo_congruent as a separate dataframe, and also make a
# copy without the outlier point #
congruent_geo_cor_raw_df <- edge_raw_dfs$time_geo_congruent
  1. As in 3. Branch length correlation (1), the branches highlighted in red correspond to those edges not used in the correlations
plot(anoxy_geo_time)
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]], 
    "red")))

2. Transfer = 4

Tree coloured on a scale according to the number of transfers at each branch. Numbers above each represent number of transfer events. In black, branches which are excluded from the correlations.

# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_time, edge_raw_dfs$time$T4, method = "edges", legend = 0.25, 
    title = "Number of transfers")

# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_raw_dfs$time)) {
    entry <- edge_raw_dfs$time[i, ]
    
    annotate_transfer_num(entry, i, "T4")
}

# Colour the trimmed edges black #
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]], 
    "black")))

Plot the branch lengths vs raw number of transfers at branch

# Make the subgrouping call logical rather than binary #
congruent_geo_cor_raw_df$Subgroup <- as.logical(congruent_geo_cor_df$Subgroup)
# Plot the scatter plot
ggplot(congruent_geo_cor_raw_df, aes(x = Branch_len, y = T4, label = Index, 
    color = Subgroup)) + geom_point(size = 3) + geom_text(aes(label = Index), 
    hjust = 0, vjust = -1) + xlab("Branch length") + ylab("Raw number of HGTs at penalty = 4") + 
    scale_color_manual(values = c("blue", "red"))

Correlation calculations as in 4. Branch length correlation (2)

T4_cor_df <- CalcCorrLengthTransfers(congruent_geo_cor_raw_df, "T4", 39)
print(T4_cor_df)
##               Condition Pearson Spearman Rsquared
## 1      All with outlier  0.3339   0.5343   0.1115
## 2        All no outlier  0.7281   0.6114   0.5301
## 3 Intergroup no outlier  0.8273   0.7995   0.6845
## 4 Intragroup no outlier  0.3613   0.4197   0.1305
3. Transfer = 5

Tree coloured on a scale according to the number of transfers at each branch. Numbers above each represent number of transfer events. In black, branches which are excluded from the correlations.

# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_time, edge_raw_dfs$time$T5, method = "edges", legend = 0.25, 
    title = "Number of transfers")

# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_raw_dfs$time)) {
    entry <- edge_raw_dfs$time[i, ]
    
    annotate_transfer_num(entry, i, "T5")
}

# Colour the trimmed edges black #
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]], 
    "black")))

devSVG(file = "/Users/aesin/Desktop/SMBE_poster/Tree_test.svg", height = 10, 
    width = 10)

plotBranchbyTrait(anoxy_geo_time, edge_raw_dfs$time$T5, method = "edges", legend = 0.25, 
    title = "Number of transfers")

# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_raw_dfs$time)) {
    entry <- edge_raw_dfs$time[i, ]
    
    annotate_transfer_num(entry, i, "T5")
}

# Colour the trimmed edges black #
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]], 
    "black")))

dev.off()
## quartz_off_screen 
##                 2

Plot the branch lengths vs raw number of transfers at branch

# Make the subgrouping call logical rather than binary #
congruent_geo_cor_raw_df$Subgroup <- as.logical(congruent_geo_cor_df$Subgroup)
# Plot the scatter plot

ggplot(congruent_geo_cor_raw_df, aes(x = Branch_len, y = T5, label = Index, 
    color = Subgroup)) + geom_point(size = 3) + geom_text(aes(label = Index), 
    hjust = 0, vjust = -1) + xlab("Branch length") + ylab("Raw number of HGTs at penalty = 5") + 
    scale_color_manual(values = c("blue", "red"))

devSVG(file = "/Users/aesin/Desktop/SMBE_poster/Correlation_T5.svg", height = 10, 
    width = 10)

ggplot(congruent_geo_cor_raw_df, aes(x = Branch_len, y = T5, label = Index, 
    color = Subgroup)) + geom_point(size = 3) + geom_text(aes(label = Index), 
    hjust = 0, vjust = -1) + xlab("Branch length") + ylab("Raw number of HGTs at penalty = 5") + 
    scale_color_manual(values = c("blue", "red"))

dev.off()
## quartz_off_screen 
##                 2

Correlation calculations as in 4. Branch length correlation (2)

T5_cor_df <- CalcCorrLengthTransfers(congruent_geo_cor_raw_df, "T5", 39)
print(T5_cor_df)
##               Condition Pearson Spearman Rsquared
## 1      All with outlier  0.3343   0.5128   0.1118
## 2        All no outlier  0.7052   0.5923   0.4973
## 3 Intergroup no outlier  0.8385   0.8037   0.7030
## 4 Intragroup no outlier  0.3847   0.4268   0.1480
4. Transfer = 6

Tree coloured on a scale according to the number of transfers at each branch. Numbers above each represent number of transfer events. In black, branches which are excluded from the correlations.

# Plot the tree with edges coloured by the fraction #
plotBranchbyTrait(anoxy_geo_time, edge_raw_dfs$time$T6, method = "edges", legend = 0.25, 
    title = "Number of transfers")

# Anotate the branches with the numeric values #
for (i in 1:nrow(edge_raw_dfs$time)) {
    entry <- edge_raw_dfs$time[i, ]
    
    annotate_transfer_num(entry, i, "T6")
}

# Colour the trimmed edges black #
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]], 
    "black")))

Plot the branch lengths vs raw number of transfers at branch

# Make the subgrouping call logical rather than binary #
congruent_geo_cor_raw_df$Subgroup <- as.logical(congruent_geo_cor_df$Subgroup)
# Plot the scatter plot
ggplot(congruent_geo_cor_raw_df, aes(x = Branch_len, y = T6, label = Index, 
    color = Subgroup)) + geom_point(size = 3) + geom_text(aes(label = Index), 
    hjust = 0, vjust = -1) + xlab("Branch length") + ylab("Raw number of HGTs at penalty = 6") + 
    scale_color_manual(values = c("blue", "red"))

Correlation calculations as in 4. Branch length correlation (2)

T6_cor_df <- CalcCorrLengthTransfers(congruent_geo_cor_raw_df, "T6", 39)
print(T6_cor_df)
##               Condition Pearson Spearman Rsquared
## 1      All with outlier  0.3595   0.5145   0.1293
## 2        All no outlier  0.6819   0.5540   0.4650
## 3 Intergroup no outlier  0.8239   0.7831   0.6788
## 4 Intragroup no outlier  0.3787   0.4180   0.1434
5. Summary correlations
added_penalty_col <- lapply(penalty_names, function(x) {
    df <- eval(as.name(paste0(x, "_cor_df")))
    penalty <- str_sub(x, 2, 2)
    new_col <- data.frame(Penalty = rep(penalty, nrow(df)))
    df <- cbind(df, new_col)
})

total_cor_df <- as.data.frame(rbindlist(added_penalty_col))
print(total_cor_df)
##                Condition Pearson Spearman Rsquared Penalty
## 1       All with outlier  0.3339   0.5343   0.1115       4
## 2         All no outlier  0.7281   0.6114   0.5301       4
## 3  Intergroup no outlier  0.8273   0.7995   0.6845       4
## 4  Intragroup no outlier  0.3613   0.4197   0.1305       4
## 5       All with outlier  0.3343   0.5128   0.1118       5
## 6         All no outlier  0.7052   0.5923   0.4973       5
## 7  Intergroup no outlier  0.8385   0.8037   0.7030       5
## 8  Intragroup no outlier  0.3847   0.4268   0.1480       5
## 9       All with outlier  0.3595   0.5145   0.1293       6
## 10        All no outlier  0.6819   0.5540   0.4650       6
## 11 Intergroup no outlier  0.8239   0.7831   0.6788       6
## 12 Intragroup no outlier  0.3787   0.4180   0.1434       6
total_cor_df.molten <- melt(total_cor_df, measure.vars = c("Pearson", "Spearman", 
    "Rsquared"), variable.name = "Correlation", value.name = "Score")

## Replot the tree to show the various branch identities ##
plotBranchbyTrait(anoxy_geo_time, edge_dfs$time$Subgroup, method = "edges", 
    title = "Subgroup branches")
# Label the 'removed edges' black
invisible(apply(removed_edges, 1, function(x) PlotSegmentByNodes(x[[1]], x[[2]], 
    "black")))
# Label the outlier edge green #
invisible(PlotSegmentByNodes(outlier_edge[1], outlier_edge[2], "green"))

# Label branches according to index #
for (i in as.vector(congruent_geo_cor_df$Index)) {
    edgelabels(i, as.numeric(i), adj = c(0.5, -0.25), bg = "white", frame = "none", 
        cex = 0.7)
}
# Add legend #
legend("bottomright", legend = c("Intergroup", "Intragroup", "Outlier", "Excluded"), 
    text.col = c("blue", "red", "green", "black"))

# Plot the correlations #
ggplot(total_cor_df.molten, aes(x = Condition, y = Score, variable = Penalty, 
    colour = Penalty)) + geom_point(size = 3) + facet_wrap("Correlation", nrow = 1) + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12), axis.text.y = element_text(size = 12), 
        strip.text.x = element_text(size = 12), axis.title = element_text(size = 12)) + 
    scale_y_continuous(limits = c(0, 1))